Problem

Today we are going to fit a simple regression model with yearly population to year, by each country of the world. The data is available on the gapminder package. And we will use functional programming to apply regression model to each country with an easy 10 lines of code.

Plot the data by Country and Continent

require(tidyverse)
require(gapminder)
head(gapminder)
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
gapminder %>% 
  ggplot(aes(year, pop, color = country)) +
  geom_line(show.legend = F) +
  facet_wrap(~continent, scales = "free_y") +
  scale_y_log10() # Took log to make the y axis readable

We can clearly see some difference within and between continents. There is also a linear pattern in increase of population with year. Lets fit the regression model and check which countries and continent have highest yearly population growth rate.

Fit the Regression models by each country

We are going to use the map function from the purrr package to apply regression model to each country.

require(broom)
require(purrr)

reg_models <- gapminder %>%  
  group_by(country, continent) %>% 
  nest() %>% 
  mutate(models = map(data, ~lm(pop ~ year, data = .x))) %>% 
  mutate(coefs = map(models, ~tidy(.x))) %>% 
  unnest(coefs) 

### FItted Models and Coefficients
reg_models     
## # A tibble: 284 x 9
## # Groups:   country, continent [142]
##    country  continent data    models term  estimate std.error statistic  p.value
##    <fct>    <fct>     <list>  <list> <chr>    <dbl>     <dbl>     <dbl>    <dbl>
##  1 Afghani~ Asia      <tibbl~ <lm>   (Int~  -6.91e8    1.05e8     -6.55 6.48e- 5
##  2 Afghani~ Asia      <tibbl~ <lm>   year    3.57e5    5.33e4      6.70 5.37e- 5
##  3 Albania  Europe    <tibbl~ <lm>   (Int~  -8.75e7    3.95e6    -22.1  7.90e-10
##  4 Albania  Europe    <tibbl~ <lm>   year    4.55e4    2.00e3     22.8  5.94e-10
##  5 Algeria  Africa    <tibbl~ <lm>   (Int~  -9.16e8    4.25e7    -21.5  1.04e- 9
##  6 Algeria  Africa    <tibbl~ <lm>   year    4.73e5    2.15e4     22.0  8.39e-10
##  7 Angola   Africa    <tibbl~ <lm>   (Int~  -2.78e8    2.11e7    -13.2  1.22e- 7
##  8 Angola   Africa    <tibbl~ <lm>   year    1.44e5    1.07e4     13.5  9.51e- 8
##  9 Argenti~ Americas  <tibbl~ <lm>   (Int~  -7.99e8    1.52e7    -52.5  1.53e-13
## 10 Argenti~ Americas  <tibbl~ <lm>   year    4.18e5    7.69e3     54.3  1.08e-13
## # ... with 274 more rows

Plot the Estimated Growth Rate

g1 <- reg_models %>% 
  filter(term == "year") %>% 
  arrange(estimate) %>% 
  unnest(data) %>% 
  group_by(country, continent, estimate) %>% 
  nest() %>% 
  ggplot(aes(x = reorder(country, estimate), y = estimate, fill = continent)) +
  geom_col() + 
  scale_y_log10() +
  theme_light() +
  theme(axis.text.x = element_text(size=3, angle = 90),
        panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()) +
  labs(title = "Estimated per year Population Growth Rate by Country",
       x = "Country",
       y = "Coefficient / Estimate on log_10 scale",
       fill = "Continent")
 
plotly::ggplotly(g1)

Looks like Asia, Africa and South American countries have the highest population growth rates.

Pvalue Vs Estimate in log scale

Taking log scale to make the plot readable.

require(ggrepel)
reg_models %>% 
  filter(term == "year") %>% 
  ggplot(aes(x = estimate, y = p.value, color = continent)) +
  geom_point(show.legend = F) +
  geom_hline(yintercept = 0.05, linetype = "dashed", alpha = 0.5) +
#  geom_vline(xintercept = 0.00001   , linetype = "dashed", alpha = 0.5) +
  xlab("Coefficient Value") +
  ylab("P-value in Log10 Scale") +
  scale_y_log10() +
  scale_x_log10() +
  geom_label_repel(aes(label = country), size = 2.5, alpha = 0.5, show.legend = F, max.overlaps = 20) +
  facet_wrap(~continent, scales = "free") +
  theme_light()

Digging Deep to what’s happening with the code

### First Group by Country and COntinent
gapminder %>%  
  group_by(country, continent)
## # A tibble: 1,704 x 6
## # Groups:   country, continent [142]
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,694 more rows
### The nest the data
gapminder %>%  
  group_by(country, continent) %>% 
  nest() 
## # A tibble: 142 x 3
## # Groups:   country, continent [142]
##    country     continent data             
##    <fct>       <fct>     <list>           
##  1 Afghanistan Asia      <tibble [12 x 4]>
##  2 Albania     Europe    <tibble [12 x 4]>
##  3 Algeria     Africa    <tibble [12 x 4]>
##  4 Angola      Africa    <tibble [12 x 4]>
##  5 Argentina   Americas  <tibble [12 x 4]>
##  6 Australia   Oceania   <tibble [12 x 4]>
##  7 Austria     Europe    <tibble [12 x 4]>
##  8 Bahrain     Asia      <tibble [12 x 4]>
##  9 Bangladesh  Asia      <tibble [12 x 4]>
## 10 Belgium     Europe    <tibble [12 x 4]>
## # ... with 132 more rows
### Apply Linear Regression to the column data
### Inside the data, there are columns called pop and year
### .x means take everything from each tibble of the data
gapminder %>%  
  group_by(country, continent) %>% 
  nest() %>% 
  mutate(models = map(data, ~lm(pop ~ year, data = .x)))
## # A tibble: 142 x 4
## # Groups:   country, continent [142]
##    country     continent data              models
##    <fct>       <fct>     <list>            <list>
##  1 Afghanistan Asia      <tibble [12 x 4]> <lm>  
##  2 Albania     Europe    <tibble [12 x 4]> <lm>  
##  3 Algeria     Africa    <tibble [12 x 4]> <lm>  
##  4 Angola      Africa    <tibble [12 x 4]> <lm>  
##  5 Argentina   Americas  <tibble [12 x 4]> <lm>  
##  6 Australia   Oceania   <tibble [12 x 4]> <lm>  
##  7 Austria     Europe    <tibble [12 x 4]> <lm>  
##  8 Bahrain     Asia      <tibble [12 x 4]> <lm>  
##  9 Bangladesh  Asia      <tibble [12 x 4]> <lm>  
## 10 Belgium     Europe    <tibble [12 x 4]> <lm>  
## # ... with 132 more rows
### We created the model column
### Now we tidy the model column
### This created tibbles of coefficients foe each country
gapminder %>%  
  group_by(country, continent) %>% 
  nest() %>% 
  mutate(models = map(data, ~lm(pop ~ year, data = .x))) %>% 
  mutate(coefs = map(models, ~tidy(.x)))
## # A tibble: 142 x 5
## # Groups:   country, continent [142]
##    country     continent data              models coefs           
##    <fct>       <fct>     <list>            <list> <list>          
##  1 Afghanistan Asia      <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  2 Albania     Europe    <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  3 Algeria     Africa    <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  4 Angola      Africa    <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  5 Argentina   Americas  <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  6 Australia   Oceania   <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  7 Austria     Europe    <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  8 Bahrain     Asia      <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
##  9 Bangladesh  Asia      <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
## 10 Belgium     Europe    <tibble [12 x 4]> <lm>   <tibble [2 x 5]>
## # ... with 132 more rows
### Finally unnest the coefs column to get our estimates
gapminder %>%  
  group_by(country, continent) %>% 
  nest() %>% 
  mutate(models = map(data, ~lm(pop ~ year, data = .x))) %>% 
  mutate(coefs = map(models, ~tidy(.x))) %>% 
  unnest(coefs) 
## # A tibble: 284 x 9
## # Groups:   country, continent [142]
##    country  continent data    models term  estimate std.error statistic  p.value
##    <fct>    <fct>     <list>  <list> <chr>    <dbl>     <dbl>     <dbl>    <dbl>
##  1 Afghani~ Asia      <tibbl~ <lm>   (Int~  -6.91e8    1.05e8     -6.55 6.48e- 5
##  2 Afghani~ Asia      <tibbl~ <lm>   year    3.57e5    5.33e4      6.70 5.37e- 5
##  3 Albania  Europe    <tibbl~ <lm>   (Int~  -8.75e7    3.95e6    -22.1  7.90e-10
##  4 Albania  Europe    <tibbl~ <lm>   year    4.55e4    2.00e3     22.8  5.94e-10
##  5 Algeria  Africa    <tibbl~ <lm>   (Int~  -9.16e8    4.25e7    -21.5  1.04e- 9
##  6 Algeria  Africa    <tibbl~ <lm>   year    4.73e5    2.15e4     22.0  8.39e-10
##  7 Angola   Africa    <tibbl~ <lm>   (Int~  -2.78e8    2.11e7    -13.2  1.22e- 7
##  8 Angola   Africa    <tibbl~ <lm>   year    1.44e5    1.07e4     13.5  9.51e- 8
##  9 Argenti~ Americas  <tibbl~ <lm>   (Int~  -7.99e8    1.52e7    -52.5  1.53e-13
## 10 Argenti~ Americas  <tibbl~ <lm>   year    4.18e5    7.69e3     54.3  1.08e-13
## # ... with 274 more rows